{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "%matplotlib inline\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "\n",
    "from collections import namedtuple, defaultdict"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "sns.set_context(\"poster\")\n",
    "sns.set_style(\"ticks\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "DATA_DIR=\"data/data/\"\n",
    "CLEANED_DIR=\"data/cleaned/\"\n",
    "\n",
    "Tag = namedtuple(\"Tag\", [\"token\", \"tag\"])\n",
    "\n",
    "def load_sequences(filename, sep=\"\\t\", notypes=False):\n",
    "    tag_count = defaultdict(int)\n",
    "    sequences = []\n",
    "    with open(filename) as fp:\n",
    "        seq = []\n",
    "        for line in fp:\n",
    "            line = line.strip()\n",
    "            if line:\n",
    "                line = line.split(sep)\n",
    "                if notypes:\n",
    "                    line[1] = line[1][0]\n",
    "                tag_count[line[1]] += 1\n",
    "                #print line\n",
    "                seq.append(Tag(*line))\n",
    "            else:\n",
    "                sequences.append(seq)\n",
    "                seq = []\n",
    "        if seq:\n",
    "            sequences.append(seq)\n",
    "    return sequences, tag_count\n",
    "\n",
    "def write_sequences(sequences, filename, sep=\"\\t\", to_bieou=True):\n",
    "    \"\"\"Write sequences to `filename`, one token per line.\n",
    "\n",
    "    Each item of a sequence is written as its fields joined by `sep`, and\n",
    "    every sequence is followed by one empty line.  When `to_bieou` is true\n",
    "    each sequence is passed through to_BIEOU before being written.\n",
    "    \"\"\"\n",
    "    with open(filename, \"wb+\") as out:\n",
    "        for sequence in sequences:\n",
    "            rows = to_BIEOU(sequence) if to_bieou else sequence\n",
    "            for row in rows:\n",
    "                out.write(sep.join(row) + \"\\n\")\n",
    "            # Empty line marks the end of the sequence.\n",
    "            out.write(\"\\n\")\n",
    "                \n",
    "def count_phrases(ptype=\"movie\"):\n",
    "    phrase_counts = defaultdict(int)\n",
    "    check_tag = ptype\n",
    "    for seq in sequences:\n",
    "        phrase = \"\"\n",
    "        for tag in seq:\n",
    "            if not phrase and tag.tag == \"B-%s\" % check_tag:\n",
    "                phrase = tag.token\n",
    "                continue\n",
    "            if tag.tag == \"I-%s\" % check_tag:\n",
    "                phrase += \" %s\" % tag.token\n",
    "                continue\n",
    "            if phrase:\n",
    "                phrase_counts[phrase] += 1\n",
    "                phrase = \"\"\n",
    "    return phrase_counts\n",
    "\n",
    "\n",
    "def phrase_to_BIEOU(phrase):\n",
    "    l = len(phrase)\n",
    "    new_phrase = []\n",
    "    for j, t in enumerate(phrase):\n",
    "        new_tag = t.tag\n",
    "        if l == 1:\n",
    "            new_tag = \"U%s\" % t.tag[1:]\n",
    "        elif j == l-1:\n",
    "            new_tag = \"E%s\" % t.tag[1:]\n",
    "        new_phrase.append(Tag(t.token, new_tag))\n",
    "    return new_phrase\n",
    "\n",
    "def to_BIEOU(seq, verbose=False):\n",
    "    # TAGS B I E U O\n",
    "    phrase = []\n",
    "    new_seq = []\n",
    "    for i, tag in enumerate(seq):\n",
    "        if not phrase and tag.tag[0] == \"B\":\n",
    "            phrase.append(tag)\n",
    "            continue\n",
    "        if tag.tag[0] == \"I\":\n",
    "            phrase.append(tag)\n",
    "            continue\n",
    "        if phrase:\n",
    "            if verbose:\n",
    "                print \"Editing phrase\", phrase\n",
    "            new_phrase = phrase_to_BIEOU(phrase)\n",
    "            new_seq.extend(new_phrase)\n",
    "            phrase = []\n",
    "        new_seq.append(tag)\n",
    "    if phrase:\n",
    "        if verbose:\n",
    "            print \"Editing phrase\", phrase\n",
    "        new_phrase = phrase_to_BIEOU(phrase)\n",
    "        new_seq.extend(new_phrase)\n",
    "        phrase = []\n",
    "    return new_seq"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Clean data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Round-trip the train split through load/write to produce its .tsv copy (tags left as-is).\n",
    "sequences, tag_count = load_sequences(DATA_DIR + \"train\", sep=\"\\t\")\n",
    "write_sequences(sequences, CLEANED_DIR + \"train.tsv\", to_bieou=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Round-trip the dev split through load/write to produce its .tsv copy (tags left as-is).\n",
    "sequences, tag_count = load_sequences(DATA_DIR + \"dev\", sep=\"\\t\")\n",
    "write_sequences(sequences, CLEANED_DIR + \"dev.tsv\", to_bieou=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Round-trip the dev_2015 split through load/write to produce its .tsv copy (tags left as-is).\n",
    "sequences, tag_count = load_sequences(DATA_DIR + \"dev_2015\", sep=\"\\t\")\n",
    "write_sequences(sequences, CLEANED_DIR + \"dev_2015.tsv\", to_bieou=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Round-trip the test split through load/write to produce its .tsv copy (tags left as-is).\n",
    "sequences, tag_count = load_sequences(DATA_DIR + \"test.txt\", sep=\"\\t\")\n",
    "write_sequences(sequences, CLEANED_DIR + \"test.tsv\", to_bieou=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Load the .tsv copy of the train split for the inspection cells below.\n",
    "sequences, tag_count = load_sequences(CLEANED_DIR + \"train.tsv\", sep=\"\\t\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "2394"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(sequences)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "46469"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sum(len(seq) for seq in sequences)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[Tag(token='good', tag='O'),\n",
       " Tag(token='friday', tag='O'),\n",
       " Tag(token='whatchu', tag='O'),\n",
       " Tag(token='got', tag='O'),\n",
       " Tag(token='for', tag='O'),\n",
       " Tag(token='me', tag='O'),\n",
       " Tag(token='@kanyewest', tag='O')]"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sequences[-1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "defaultdict(int,\n",
       "            {'B-company': 171,\n",
       "             'B-facility': 104,\n",
       "             'B-geo-loc': 276,\n",
       "             'B-movie': 34,\n",
       "             'B-musicartist': 55,\n",
       "             'B-other': 225,\n",
       "             'B-person': 449,\n",
       "             'B-product': 97,\n",
       "             'B-sportsteam': 51,\n",
       "             'B-tvshow': 34,\n",
       "             'I-company': 36,\n",
       "             'I-facility': 105,\n",
       "             'I-geo-loc': 49,\n",
       "             'I-movie': 46,\n",
       "             'I-musicartist': 61,\n",
       "             'I-other': 320,\n",
       "             'I-person': 215,\n",
       "             'I-product': 80,\n",
       "             'I-sportsteam': 23,\n",
       "             'I-tvshow': 31,\n",
       "             'O': 44007})"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tag_count"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "defaultdict(int,\n",
       "            {'Agora': 1,\n",
       "             'Alpha &amp; Omega': 1,\n",
       "             'Breaking Dawn': 1,\n",
       "             'Dazed &amp; Confused': 1,\n",
       "             'Devil': 1,\n",
       "             'Easy A': 1,\n",
       "             'Gigli': 2,\n",
       "             'Half baked': 1,\n",
       "             'Iron Man 2': 1,\n",
       "             'JENNIFERS BODY': 1,\n",
       "             'Kick-Ass': 1,\n",
       "             'Knight and Day': 1,\n",
       "             'Les Miserables': 1,\n",
       "             'Mar Adentro': 1,\n",
       "             'Paranormal Activity 2': 1,\n",
       "             'Piranha 3D': 1,\n",
       "             'Princess Lover OVA 1': 1,\n",
       "             'Sex In The City': 1,\n",
       "             'The Ride': 1,\n",
       "             'The Room': 1,\n",
       "             'The Sea Inside': 1,\n",
       "             'The Town': 3,\n",
       "             'Toy story 3': 1,\n",
       "             \"Winter 's Bone\": 2,\n",
       "             'camp rock 2': 1,\n",
       "             'grandma boys': 1,\n",
       "             'how high': 1,\n",
       "             'nightmare before christmas': 1,\n",
       "             'puff puff pass': 1,\n",
       "             'rocky horror show': 1})"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "phrase_counts = count_phrases(ptype=\"movie\")\n",
    "phrase_counts"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "defaultdict(int,\n",
       "            {' Mason': 1,\n",
       "             '@Cromwell Field': 1,\n",
       "             'ASU Step Gallery': 1,\n",
       "             'BAR': 1,\n",
       "             'Band hall': 1,\n",
       "             'Botanic Gdns': 1,\n",
       "             'Bowl Long Island': 1,\n",
       "             'Burning Bush Grille': 1,\n",
       "             'CAFE NINE': 1,\n",
       "             'CALABASH LOUNGE': 1,\n",
       "             'CANWEST Center': 1,\n",
       "             'CLUB BLU': 1,\n",
       "             'Casitas': 1,\n",
       "             'Cathedral of Learning G24': 1,\n",
       "             'Champlain campus': 1,\n",
       "             'Costa Lounge': 2,\n",
       "             \"Dick 's Carpet\": 1,\n",
       "             'Dicks': 1,\n",
       "             'Dillion High School': 1,\n",
       "             'Dim Mak': 1,\n",
       "             'Disney World': 1,\n",
       "             'Disney world': 1,\n",
       "             'ESB': 2,\n",
       "             'Elements': 1,\n",
       "             'Elliot Miner': 1,\n",
       "             'Empire State Building': 2,\n",
       "             'Fight Club': 1,\n",
       "             'First Baptist': 1,\n",
       "             'Forgotten Door': 1,\n",
       "             'Futoosh': 1,\n",
       "             'Gardens': 1,\n",
       "             'Glenveagh national park': 1,\n",
       "             'Glover Park': 1,\n",
       "             'Good Day Sunshine': 1,\n",
       "             'HSBC center': 1,\n",
       "             'Herbivore': 1,\n",
       "             'Hi Pac': 1,\n",
       "             'Holyroodhouse': 1,\n",
       "             'Hotel Elegante': 1,\n",
       "             'Hustler Hollywood': 1,\n",
       "             'Hyde Park Corner': 1,\n",
       "             'JANNUS LIVE': 1,\n",
       "             'Knighttime Billiards': 1,\n",
       "             'Leacock Museum': 1,\n",
       "             'Limelight': 1,\n",
       "             'Loveless Cafe': 1,\n",
       "             'Luna Lounge': 1,\n",
       "             'MAiZE': 1,\n",
       "             'Mai Tai': 1,\n",
       "             'Mississippi Studios': 1,\n",
       "             'Morgan Le Fays': 1,\n",
       "             'Music Box': 1,\n",
       "             'New Life': 1,\n",
       "             'North Phoenix ATA': 1,\n",
       "             'OFF the GRID': 1,\n",
       "             'PITT University': 1,\n",
       "             'Peppers': 1,\n",
       "             'Potbelly Lincoln Park': 1,\n",
       "             'Primal': 1,\n",
       "             'Republic': 1,\n",
       "             'Rotary Club': 1,\n",
       "             'Secrets': 1,\n",
       "             'Southgate House': 1,\n",
       "             'Spoons NW Bistro': 1,\n",
       "             'St Johns': 1,\n",
       "             'Stars': 1,\n",
       "             'Stetson': 1,\n",
       "             'Student Centre': 1,\n",
       "             'Summits': 1,\n",
       "             'Sunset Bistro': 1,\n",
       "             'Swayzes Venue': 1,\n",
       "             'The Echo': 1,\n",
       "             'The Heartbeat': 1,\n",
       "             'The Heatwave': 1,\n",
       "             'The Lodge': 1,\n",
       "             'The Metro': 1,\n",
       "             'The Pub': 1,\n",
       "             'The Standard': 1,\n",
       "             'Tipitina': 1,\n",
       "             'Town Square Place': 1,\n",
       "             'Trophy Club Gwinnett': 1,\n",
       "             'U.S. Capitol building': 1,\n",
       "             'USM': 1,\n",
       "             'VISIONS LOUNGE': 3,\n",
       "             'White Eagle': 1,\n",
       "             'Wildcat Stadium': 1,\n",
       "             'barksdale': 1,\n",
       "             'belles mansion': 1,\n",
       "             'frat house Hattiesburg': 2,\n",
       "             'lhs': 1,\n",
       "             'muse': 1,\n",
       "             'new Teaze cafe': 1,\n",
       "             'the Apollo': 1,\n",
       "             'the Fitzwilliam': 1,\n",
       "             'the funny bone': 1,\n",
       "             'the tall ship Silva': 1})"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "phrase_counts = count_phrases(ptype=\"facility\")\n",
    "phrase_counts"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "defaultdict(int,\n",
       "            {'#Vh1': 1,\n",
       "             'AEG': 1,\n",
       "             'AMerican': 1,\n",
       "             'AT&amp;T': 1,\n",
       "             'Accel': 2,\n",
       "             'Amazon U.K.': 1,\n",
       "             'Ball Metal Container': 1,\n",
       "             'Blackberry': 1,\n",
       "             'Brando': 1,\n",
       "             'Business Alliance': 1,\n",
       "             'CORT': 1,\n",
       "             'Camelbak': 1,\n",
       "             'Chevron': 1,\n",
       "             'Cisco': 1,\n",
       "             'Costco': 1,\n",
       "             'Crocs': 1,\n",
       "             'Cyber-Ark': 1,\n",
       "             'Daily Mail': 1,\n",
       "             'Deep Sea Intervention': 2,\n",
       "             'Delphi': 1,\n",
       "             'Disney': 1,\n",
       "             'Electric Lady Studios': 1,\n",
       "             'Engadget': 1,\n",
       "             'EuroVPS': 1,\n",
       "             'Evergreen Subaru': 1,\n",
       "             'FACEBOOK': 1,\n",
       "             'FB': 1,\n",
       "             'Facebook': 6,\n",
       "             'FedEx': 1,\n",
       "             'Forex': 1,\n",
       "             'Gabriel Resources': 1,\n",
       "             'Game Informer': 1,\n",
       "             'Guinness': 1,\n",
       "             'High House Farm': 1,\n",
       "             'Ione': 1,\n",
       "             'Juniper': 1,\n",
       "             'MAC': 1,\n",
       "             'MTS': 1,\n",
       "             'MTV': 1,\n",
       "             'Marie Clair': 1,\n",
       "             'Marlboro Ducati': 1,\n",
       "             'MetroPCS': 1,\n",
       "             'Microsoft': 1,\n",
       "             'Netflix': 1,\n",
       "             'Nex-Tech': 1,\n",
       "             'Nex-Tech Wireless': 1,\n",
       "             'Novel Wines': 1,\n",
       "             'Optimise': 1,\n",
       "             'PAMPERS': 1,\n",
       "             'Pedigree': 1,\n",
       "             'Pepsi': 3,\n",
       "             'Pizza Hut': 1,\n",
       "             'Playboy': 4,\n",
       "             'Pxleyes': 1,\n",
       "             'Reuters': 2,\n",
       "             'Serious Eats': 1,\n",
       "             'Sky 3D': 2,\n",
       "             \"Sotheby | ' s\": 1,\n",
       "             'South 107': 1,\n",
       "             'Starstruck': 1,\n",
       "             'Subway': 1,\n",
       "             'Super Angels': 1,\n",
       "             'Super City': 1,\n",
       "             'TMZ': 2,\n",
       "             'TWITTER': 1,\n",
       "             'Techland': 1,\n",
       "             'Time Warner Cable': 1,\n",
       "             'Twitter': 17,\n",
       "             'UFlex': 1,\n",
       "             'Ustream': 5,\n",
       "             'VEVO': 1,\n",
       "             'Visa': 1,\n",
       "             'Widro': 1,\n",
       "             'Yahoo': 3,\n",
       "             'YouTube': 12,\n",
       "             'facebook': 6,\n",
       "             'fb': 1,\n",
       "             'flipboard': 1,\n",
       "             'gucci': 2,\n",
       "             'iTunes': 3,\n",
       "             'mcdonalds': 1,\n",
       "             'msn': 1,\n",
       "             'myspace': 1,\n",
       "             'ngmoco': 1,\n",
       "             'nintendo': 1,\n",
       "             'old navy': 1,\n",
       "             'port city java': 1,\n",
       "             'sirius': 1,\n",
       "             'skype': 1,\n",
       "             'target': 1,\n",
       "             'tumblr': 1,\n",
       "             'twitpic': 1,\n",
       "             'twitter': 13,\n",
       "             'ufc': 1,\n",
       "             'verizon': 1,\n",
       "             'walmart': 1,\n",
       "             'wipro': 1,\n",
       "             'you tube': 1,\n",
       "             'youtube': 1})"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "phrase_counts = count_phrases(ptype=\"company\")\n",
    "phrase_counts"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "defaultdict(int,\n",
       "            {'30stm': 2,\n",
       "             'BORDER LINEA': 1,\n",
       "             'Baka Boyz': 1,\n",
       "             'Big Time Rush': 1,\n",
       "             'Blonde Redhead': 1,\n",
       "             'Breaking &amp; Entering': 1,\n",
       "             \"Cap'n Jazz\": 1,\n",
       "             'CocoFunka': 2,\n",
       "             'Cowboy Mouth': 1,\n",
       "             'Crooked': 1,\n",
       "             'DJ Chris L': 1,\n",
       "             'DJ Elements': 1,\n",
       "             'DJ STRATEGY': 1,\n",
       "             'DOES IT OFFEND YOU , YEAH ?': 1,\n",
       "             'DT': 1,\n",
       "             'Delirious': 1,\n",
       "             'Fall Out Boy': 1,\n",
       "             'Ghostland Observatory': 1,\n",
       "             'Green Day': 1,\n",
       "             'INDIGENOUS': 1,\n",
       "             'Jonas Bros': 1,\n",
       "             'KC and The Sunshine Band': 1,\n",
       "             'KISS': 1,\n",
       "             'Lucid Dementia': 1,\n",
       "             'Maroon 5': 1,\n",
       "             'Maunalua': 1,\n",
       "             'Metallica': 1,\n",
       "             'Mystery Jets': 1,\n",
       "             'N.E.R.D.': 1,\n",
       "             'Natives': 1,\n",
       "             'Primus': 1,\n",
       "             'SHAKEMODE': 1,\n",
       "             'SUENALO': 1,\n",
       "             'Shpongle': 1,\n",
       "             'Simon Posford': 1,\n",
       "             'Sunny Day Real Estate': 1,\n",
       "             'Testing Tomorrow': 2,\n",
       "             'The Get Up Kids': 1,\n",
       "             'The Maccabees': 1,\n",
       "             'Three Days Grace': 1,\n",
       "             'UNKLE': 1,\n",
       "             'Y La Bamba': 1,\n",
       "             'Younger Brother': 1,\n",
       "             'Zac Brown Band': 1,\n",
       "             'green day': 1,\n",
       "             'kings and queens': 2,\n",
       "             'kings of leon': 2,\n",
       "             'the Rza': 1})"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "phrase_counts = count_phrases(ptype=\"musicartist\")\n",
    "phrase_counts"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "collapsed": false,
    "scrolled": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "defaultdict(int,\n",
       "            {'#Vh1': 1,\n",
       "             '11 Bantam Draft': 1,\n",
       "             '1st Presbyterian': 1,\n",
       "             '30+ Beautiful Ladies': 1,\n",
       "             '6th Biannual 24 Hour Prayer Focus': 1,\n",
       "             'A Different Kind Of Ache': 1,\n",
       "             'A&amp;E': 2,\n",
       "             'ACF Friday Large Group': 1,\n",
       "             'ADHD': 1,\n",
       "             'AHFA': 1,\n",
       "             'AP': 1,\n",
       "             'APO': 1,\n",
       "             'Above&amp;Beyond': 1,\n",
       "             'Alejandro': 1,\n",
       "             'American Professional Football Association': 1,\n",
       "             'Army Run': 1,\n",
       "             'AstronomersWithoutBorders': 1,\n",
       "             'Atlas Shrugs': 1,\n",
       "             'Awakening Festival': 1,\n",
       "             'BBL': 1,\n",
       "             'BCAT': 1,\n",
       "             \"Badass Babes Savin ' The Day\": 1,\n",
       "             'Battle of Britain': 1,\n",
       "             'Blakehurst': 1,\n",
       "             'Body Heat': 1,\n",
       "             'CCU': 1,\n",
       "             'COLLEGE REPUBLICAN TAILGATE': 1,\n",
       "             'COP': 1,\n",
       "             'CampaignMonitor': 1,\n",
       "             'Centennial College': 1,\n",
       "             'Christmas': 3,\n",
       "             'CityTV': 1,\n",
       "             'Civil House': 1,\n",
       "             'Constitution Day': 3,\n",
       "             'Copa Sudamericana': 1,\n",
       "             'Costa do Sauipe': 1,\n",
       "             'DADT': 1,\n",
       "             'DJ Got Us Fallin in Love': 1,\n",
       "             'DOUCHEBAG LOS ANGELES OPENING WEEKEND EXTRAVAGANZA': 1,\n",
       "             'Dancing Queen': 1,\n",
       "             'Danger Days : The True Lives Of The Fabulous Killjoys': 2,\n",
       "             'Danger Days : True Lives of The Fabulous Killjoys': 1,\n",
       "             'Dems': 2,\n",
       "             'Desert Rd': 1,\n",
       "             'DoE': 1,\n",
       "             'EBL': 1,\n",
       "             'East Nu': 1,\n",
       "             'Ecclesiastes 7:14': 1,\n",
       "             'Edmonton Journal': 1,\n",
       "             'Epix Movie Channel': 1,\n",
       "             'Erenice War': 1,\n",
       "             'Euroleague': 1,\n",
       "             'FDA': 2,\n",
       "             'Fashion week': 1,\n",
       "             'Fathers Day': 2,\n",
       "             'Fearless Tour': 1,\n",
       "             'Figgy Soda': 1,\n",
       "             'First Day of Autumn Networking Mixer': 1,\n",
       "             'GOP': 2,\n",
       "             'GOTTA BODY': 1,\n",
       "             'Good Friday': 1,\n",
       "             'Grammar Boot Camp': 1,\n",
       "             'Grand Masquerade': 1,\n",
       "             'Great Pumpkin': 1,\n",
       "             'Greek Festival': 2,\n",
       "             'GuitarNews': 1,\n",
       "             'HCA 8K': 1,\n",
       "             'Hangeland': 1,\n",
       "             \"He 's Just A Boy\": 1,\n",
       "             'Homecoming Week': 1,\n",
       "             'HoveLive Festival': 1,\n",
       "             'Hudson TMA': 1,\n",
       "             'Hurricane Earl': 1,\n",
       "             'Hyper V': 1,\n",
       "             'Information Underground': 2,\n",
       "             'International Observe the Moon Night': 1,\n",
       "             \"It 's Gucci Time\": 1,\n",
       "             'June Fest 2010': 1,\n",
       "             'Jup Force PvP': 1,\n",
       "             'K ! Tour': 1,\n",
       "             'KLH': 1,\n",
       "             'Korean War': 1,\n",
       "             'LIVESTRONG Day': 1,\n",
       "             'Land of Talk': 1,\n",
       "             'Lebowski Fest': 1,\n",
       "             'Leroy Jenkins Day': 1,\n",
       "             'Lingerie football league': 1,\n",
       "             'London Fashion Week': 1,\n",
       "             'MLB': 1,\n",
       "             'MOTOGP': 1,\n",
       "             'MRB @ 61 Roadhouse BBQ': 1,\n",
       "             'MTV': 1,\n",
       "             'Manning Bowl I': 1,\n",
       "             'Mark 14:30': 1,\n",
       "             'Mayan Doomsday': 1,\n",
       "             'Miami 2 Ibiza': 1,\n",
       "             'Milford Rd': 1,\n",
       "             'Mothers Day': 1,\n",
       "             'Mumpreneur Awards': 1,\n",
       "             'Music News Net': 1,\n",
       "             'NBA': 1,\n",
       "             'NC St': 1,\n",
       "             'NFL)': 1,\n",
       "             'NRSC': 1,\n",
       "             'National POW/MIA Recognition Day': 1,\n",
       "             'New York Fashion Week': 2,\n",
       "             'Not a Genuine Black Man': 1,\n",
       "             'Oh Canada Team': 1,\n",
       "             'On My Own Time': 1,\n",
       "             'Orange &amp; Blue Skate': 1,\n",
       "             'PLANO BALLOON FESTIVAL 2010': 1,\n",
       "             'Park(ing ) Day': 1,\n",
       "             'Parking Day': 2,\n",
       "             'Pisces': 1,\n",
       "             'Plano Balloon Festival': 1,\n",
       "             \"R'lyeh,\": 1,\n",
       "             'Radio 1': 1,\n",
       "             'Rainy Days': 1,\n",
       "             \"Remington 's Annual Octoberfest\": 1,\n",
       "             'RockHouse': 1,\n",
       "             'Romeo &amp; Julliet': 1,\n",
       "             'S8 blog': 1,\n",
       "             'SB1070': 1,\n",
       "             'SEDUCTION SATURDAYS': 1,\n",
       "             'SKY news': 1,\n",
       "             'SS 2011 Fashion Week': 1,\n",
       "             'SUPAHFEST': 1,\n",
       "             'Sage Award': 1,\n",
       "             'San Jose Mercury News': 1,\n",
       "             'Serie A': 1,\n",
       "             'Simon &amp; Schuster': 1,\n",
       "             'Small Biz Tech Tour 2010': 1,\n",
       "             'Society of Mining and Metallurgy Engineers': 2,\n",
       "             'Software Freedom Day': 1,\n",
       "             'Special Olympics': 1,\n",
       "             'Spirit Week': 1,\n",
       "             'Sports Day': 1,\n",
       "             \"St . Luke 's\": 1,\n",
       "             \"St . Patty 's Day\": 1,\n",
       "             \"St John 's wort\": 1,\n",
       "             'Stakes Day': 1,\n",
       "             'Stanford': 1,\n",
       "             'Swagger/Kofi Match': 1,\n",
       "             \"TEACHER'S DAY 2\": 2,\n",
       "             'TIFF': 1,\n",
       "             'TIFO CREW': 1,\n",
       "             'TailGreat': 1,\n",
       "             'Teenage Dream': 1,\n",
       "             'Tenshi ni Fureta yo': 1,\n",
       "             'Thanksgiving': 2,\n",
       "             'The 31st annual Plano Balloon Festival': 1,\n",
       "             'The DeAndre Way': 3,\n",
       "             'The Devils Art': 1,\n",
       "             'The Drama Edition': 1,\n",
       "             'This Is Our Someday': 1,\n",
       "             'Tournament of Champions': 1,\n",
       "             'Trance Around The World 338 ( Wippenberg Guestmix )': 1,\n",
       "             'Treasury': 2,\n",
       "             'UI': 1,\n",
       "             'UMass': 1,\n",
       "             'UNC': 1,\n",
       "             'USM': 1,\n",
       "             'Ultimate Magic': 1,\n",
       "             'Unsung': 1,\n",
       "             'Until the End of Time': 1,\n",
       "             'VMAs': 2,\n",
       "             'VMB awards': 1,\n",
       "             'Valentines Day': 1,\n",
       "             'Vuelter blog': 1,\n",
       "             'Walk at Work Day': 1,\n",
       "             'Water': 1,\n",
       "             'We All Are Dancing': 1,\n",
       "             'Whole Hog Dinner': 1,\n",
       "             'With You': 1,\n",
       "             'Words and Wine': 1,\n",
       "             'World War 3': 1,\n",
       "             'Wright family': 1,\n",
       "             'YOM KIPPUR': 1,\n",
       "             'Yom Kippur': 5,\n",
       "             'buckboardday parade': 1,\n",
       "             'christmas': 1,\n",
       "             'dynamite': 1,\n",
       "             'eBGP': 1,\n",
       "             'goodbye moon': 1,\n",
       "             'iBGP': 1,\n",
       "             'imdb': 1,\n",
       "             'london fashion week': 2,\n",
       "             'mothers day': 1,\n",
       "             'national flat picking championship': 1,\n",
       "             'strokes of genius': 1,\n",
       "             'when im gone': 1,\n",
       "             'xmas': 3,\n",
       "             'yakuza': 1})"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "phrase_counts = count_phrases(ptype=\"other\")\n",
    "phrase_counts"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[Tag(token='Made', tag='O'),\n",
       " Tag(token='it', tag='O'),\n",
       " Tag(token='back', tag='O'),\n",
       " Tag(token='home', tag='O'),\n",
       " Tag(token='to', tag='O'),\n",
       " Tag(token='GA', tag='U-geo-loc'),\n",
       " Tag(token='.', tag='O'),\n",
       " Tag(token='It', tag='O'),\n",
       " Tag(token='sucks', tag='O'),\n",
       " Tag(token='not', tag='O'),\n",
       " Tag(token='to', tag='O'),\n",
       " Tag(token='be', tag='O'),\n",
       " Tag(token='at', tag='O'),\n",
       " Tag(token='Disney', tag='B-facility'),\n",
       " Tag(token='world', tag='E-facility'),\n",
       " Tag(token=',', tag='O'),\n",
       " Tag(token='but', tag='O'),\n",
       " Tag(token='its', tag='O'),\n",
       " Tag(token='good', tag='O'),\n",
       " Tag(token='to', tag='O'),\n",
       " Tag(token='be', tag='O'),\n",
       " Tag(token='home', tag='O'),\n",
       " Tag(token='.', tag='O'),\n",
       " Tag(token='Time', tag='O'),\n",
       " Tag(token='to', tag='O'),\n",
       " Tag(token='start', tag='O'),\n",
       " Tag(token='planning', tag='O'),\n",
       " Tag(token='the', tag='O'),\n",
       " Tag(token='next', tag='O'),\n",
       " Tag(token='Disney', tag='B-facility'),\n",
       " Tag(token='World', tag='E-facility'),\n",
       " Tag(token='trip', tag='O'),\n",
       " Tag(token='.', tag='O')]"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "to_BIEOU(sequences[1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "write_sequences(sequences, \"data/cleaned/train.BIEOU.tsv\", to_bieou=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Validation set"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "sequences, tag_count = load_sequences(\"data/cleaned/dev.tsv\", sep=\"\\t\")\n",
    "write_sequences(sequences, \"data/cleaned/dev.BIEOU.tsv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "sequences, tag_count = load_sequences(\"data/cleaned/dev.tsv\", sep=\"\\t\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[Tag(token='Happy', tag='O'), Tag(token='Good', tag='B-other'), Tag(token='Friday', tag='I-other')]\n"
     ]
    }
   ],
   "source": [
    "# Show the first dev sequence whose first token is \"Happy\".\n",
    "for seq in sequences:\n",
    "    # load_sequences appends a sequence on every blank line, so consecutive\n",
    "    # blank lines yield empty lists; check for that before indexing.\n",
    "    if seq and seq[0].token == \"Happy\":\n",
    "        print(seq)\n",
    "        break"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[Tag(token='Happy', tag='O'),\n",
       " Tag(token='Good', tag='B-other'),\n",
       " Tag(token='Friday', tag='E-other')]"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "to_BIEOU(seq)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Validation 2015"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "sequences, tag_count = load_sequences(\"data/cleaned/dev_2015.tsv\", sep=\"\\t\")\n",
    "write_sequences(sequences, \"data/cleaned/dev_2015.BIEOU.tsv\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Test data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "sequences, tag_count = load_sequences(\"data/cleaned/test.tsv\", sep=\"\\t\")\n",
    "write_sequences(sequences, \"data/cleaned/test.BIEOU.tsv\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# No Types"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Training"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# notypes=True keeps only the first character of every tag.\n",
    "sequences, tag_count = load_sequences(\"data/cleaned/train.tsv\", sep=\"\\t\", notypes=True)\n",
    "write_sequences(sequences, \"data/cleaned/train_notypes.tsv\", to_bieou=False)\n",
    "\n",
    "# train.BIEOU.tsv was written with to_bieou=True, so its tags are already converted;\n",
    "# to_bieou=False keeps write_sequences from running to_BIEOU on them a second time.\n",
    "sequences, tag_count = load_sequences(\"data/cleaned/train.BIEOU.tsv\", sep=\"\\t\", notypes=True)\n",
    "write_sequences(sequences, \"data/cleaned/train_notypes.BIEOU.tsv\", to_bieou=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Validation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# notypes=True keeps only the first character of every tag.\n",
    "sequences, tag_count = load_sequences(\"data/cleaned/dev.tsv\", sep=\"\\t\", notypes=True)\n",
    "write_sequences(sequences, \"data/cleaned/dev_notypes.tsv\", to_bieou=False)\n",
    "\n",
    "# dev.BIEOU.tsv was written with to_bieou=True, so its tags are already converted;\n",
    "# to_bieou=False keeps write_sequences from running to_BIEOU on them a second time.\n",
    "sequences, tag_count = load_sequences(\"data/cleaned/dev.BIEOU.tsv\", sep=\"\\t\", notypes=True)\n",
    "write_sequences(sequences, \"data/cleaned/dev_notypes.BIEOU.tsv\", to_bieou=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Validation 2015"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# notypes=True keeps only the first character of every tag.\n",
    "sequences, tag_count = load_sequences(\"data/cleaned/dev_2015.tsv\", sep=\"\\t\", notypes=True)\n",
    "write_sequences(sequences, \"data/cleaned/dev_2015_notypes.tsv\", to_bieou=False)\n",
    "\n",
    "# dev_2015.BIEOU.tsv was written with to_bieou=True, so its tags are already converted;\n",
    "# to_bieou=False keeps write_sequences from running to_BIEOU on them a second time.\n",
    "sequences, tag_count = load_sequences(\"data/cleaned/dev_2015.BIEOU.tsv\", sep=\"\\t\", notypes=True)\n",
    "write_sequences(sequences, \"data/cleaned/dev_2015_notypes.BIEOU.tsv\", to_bieou=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "## Test data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# notypes=True keeps only the first character of every tag.\n",
    "sequences, tag_count = load_sequences(\"data/cleaned/test.tsv\", sep=\"\\t\", notypes=True)\n",
    "write_sequences(sequences, \"data/cleaned/test_notypes.tsv\", to_bieou=False)\n",
    "\n",
    "# test.BIEOU.tsv was written with to_bieou=True, so its tags are already converted;\n",
    "# to_bieou=False keeps write_sequences from running to_BIEOU on them a second time.\n",
    "sequences, tag_count = load_sequences(\"data/cleaned/test.BIEOU.tsv\", sep=\"\\t\", notypes=True)\n",
    "write_sequences(sequences, \"data/cleaned/test_notypes.BIEOU.tsv\", to_bieou=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
